import logging
from contextlib import closing
from store import ds, seoinclude_days, url_host, m_url_host, url_path, get_baidu_config
import datetime
import time
from time import sleep
import requests
from article.util import *
import json
from app.helpers import jsonDataToUrlParams,https_request_json
from util import str_to_timestamp
from bs4 import BeautifulSoup
import sys
import random

# 查看文章对应url是否被收录

# 从http://www.forshine.net/baike/tiyuzhishi/list_781_1.html
# 爬取数据 存入article表
# 2020-09-09
def run():
    """Crawl every listing page (list_781_1 .. list_781_53) of the
    体育知识 category and scrape each linked article into MySQL.

    Non-200 responses (typically 403/407 from a dead rotating proxy)
    make get() return None; such pages are skipped, not retried.
    """
    # Pagination runs from list_781_1.html through list_781_53.html.
    for page in range(1, 54):
        doc = get(
            'http://www.forshine.net/baike/tiyuzhishi/list_781_' + str(page) + '.html',
            lambda resp: BeautifulSoup(resp.text, 'lxml'),
        )
        if doc is None:
            # Proxy failed or page missing — move on to the next page.
            continue

        # Each .listtagT entry is one article row; its .fl anchor holds
        # the (possibly relative) detail-page URL.
        for item in doc.select('.listtagT'):
            href = item.select_one('.fl').attrs['href']
            # Scrape the detail page and persist it.
            res = getDetail(href)
            print('res: ', res)
    # NOTE(review): the original had a debug exit() here that aborted
    # after page 1, defeating the 53-page loop — removed.

def getDetail(url):
    """Fetch one article detail page, extract its title and body
    paragraphs, and insert the record into article_baike.

    Parameters:
        url: absolute or site-relative article URL.

    Returns a short Chinese status string ('请求当前页面是空…' on a
    failed fetch, '爬取详情页成功' on success).
    """
    # Listing pages may emit relative hrefs — prefix the site host.
    # (The original used `'http://' not in url`, which would misfire on
    # any URL merely containing that substring.)
    host = 'http://www.forshine.net'
    if not url.startswith(('http://', 'https://')):
        url = host + url

    # The site serves GB2312 but mislabels the charset, so fetch the raw
    # response first and force the encoding before parsing.
    resp = get(url, '')
    if resp is None:
        # Guard BEFORE touching the response — the original checked for
        # None only after dereferencing .encoding/.text.
        return '请求当前页面是空' + url
    resp.encoding = 'GB2312'
    doc = BeautifulSoup(resp.text, 'lxml')

    # Title is the <h3> inside the .h container; kept as raw HTML.
    title = str(doc.select_one('.h>h3'))
    print('title: ', title)
    # All <p> tags form the article body.
    article = doc.find_all('p')
    print('article: ', article)

    # Skip the first three <p> tags — boilerplate, not content.
    content = ''.join('%s' % p for p in article[3:])

    # Persist the article; INSERT IGNORE dedupes on the table's unique
    # key, and parameterized values keep the SQL injection-safe.
    with closing(ds.get_connection()) as conn, closing(conn.cursor()) as cur:
        cur.execute(
            'INSERT IGNORE INTO article_baike(title, category, content, type, state) VALUES (%s, %s, %s, %s, %s)',
            (title, 1, content, 'qita', 1)
        )
        conn.commit()

    return '爬取详情页成功'

def get(url, transform):
    """GET *url* through a randomly chosen proxy with browser-like headers.

    Parameters:
        url: the URL to fetch.
        transform: a callable applied to the Response on HTTP 200 (its
            result is returned), or a falsy value to return the raw
            Response object instead.

    Returns None for any non-200 status.
    """
    headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 '
                        'Safari/537.36 ',
            'Referer': 'http://www.forshine.net/baike/tiyuzhishi/list_781_2.html',
            # Header values must be str — requests rejects int values.
            'Upgrade-Insecure-Requests': '1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Cookie': '__cdnuid_h=aa520427e9bc23222a8aa94ba6ffdb32; Hm_lvt_6cda846867fd596b2ca7a177bac2039f=1599638243; cck_lasttime=1599647476070; cck_count=0; Hm_lpvt_6cda846867fd596b2ca7a177bac2039f=1599709422',
            'Host': 'www.forshine.net',
            'Pragma': 'no-cache',
            'Proxy-Connection': 'keep-alive'
    }
    # Fetch a fresh proxy pool and pick one at random for this request.
    ip_list = getProxy()
    proxy = get_random_ip(ip_list)
    # BUG FIX: headers must be passed by keyword — the second positional
    # parameter of requests.get is `params`, so the original call sent
    # the headers as query parameters and no custom headers at all.
    resp = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    print(resp.url, resp.status_code)
    # The site often reports ISO-8859 while the real charset is GB2312;
    # callers (getDetail) force the encoding themselves.
    print(resp.encoding)           # charset declared by the server
    print(resp.apparent_encoding)  # charset detected from the body

    if transform:
        return transform(resp) if resp.status_code == requests.codes.ok else None
    else:
        return resp if resp.status_code == requests.codes.ok else None


def getProxy():
    """Fetch a fresh batch of usable proxies from the horocn API.

    Sleeps 10 seconds after the call so back-to-back requests do not
    trip the API's rate limit ({'code': 10001, 'msg': '调用频率过快'}).
    """
    api_url = (
        'https://proxyapi.horocn.com/api/v2/proxies'
        '?order_id=TIPR1677438793501643&num=10&format=json'
        '&line_separator=win&can_repeat=no'
        '&user_token=64f45ce8608c4177fcc297d053f4e458'
    )
    proxies = get_ip_list(api_url)
    # Throttle so the next caller does not hit the API too quickly.
    sleep(10)
    return proxies

# Filter the proxy list down to proxies that answer a probe request.
def get_ip_list(url):
    """Call the proxy API at *url*, parse its JSON payload, and return
    the 'host:port' strings that pass a quick liveness probe.

    The probe is best-effort: a proxy that answers now may still die
    later, and one that times out once might have worked.
    """
    web_data = https_request_json(url)
    print('web_data: ', web_data)
    json_data = json.loads(web_data.text)
    print('json_data: ', json_data)

    ip_list = [i['host'] + ':' + i['port'] for i in json_data['data']]

    # BUG FIX: the original removed entries from ip_list while iterating
    # it (which silently skips elements) and probed via urllib.urlopen —
    # a Python-2-only API that was never imported. Build a fresh list
    # and probe with requests instead.
    usable = []
    for ip in ip_list:
        proxy_temp = {'http': 'http://' + ip}
        try:
            requests.get(url, proxies=proxy_temp, timeout=5)
        except Exception:
            # Dead/slow proxy — drop it and keep probing the rest.
            continue
        usable.append(ip)
    return usable

# Pick one proxy at random from the pool.
def get_random_ip(ip_list):
    """Choose a random proxy from *ip_list* ('host:port' strings) and
    return it as a requests-style proxies dict.

    Raises IndexError when the pool is empty (same as the original).
    """
    # Choose first, then prefix the scheme — no need to build a parallel
    # 'http://'-prefixed list just to pick one element from it.
    proxy_ip = 'http://' + random.choice(ip_list)
    return {'http': proxy_ip}
